Background

Tutorial for the WGCNA CRAN package (https://CRAN.R-project.org/package=WGCNA) adapted from ‘Tutorials for the WGCNA package’ (https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/index.html).

Simulation of expression and trait data

library(WGCNA)
## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
## 
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
## 
##     hclust
## 
## 
## Attaching package: 'WGCNA'
## The following object is masked from 'package:stats':
## 
##     cor
# Here are input parameters of the simulation model
# number of samples or microarrays in the training data
no.obs=50
# now we specify the true measures of eigengene significance
# recall that ESturquoise=cor(y,MEturquoise)
ESturquoise=0;   ESbrown= -.6;
ESgreen=.6;ESyellow=0
# Note that we don’t specify the eigengene significance of the blue module
# since it is highly correlated with the turquoise module.
ESvector=c(ESturquoise,ESbrown,ESgreen,ESyellow)
# number of genes 
nGenes1=3000
# proportion of genes in the turquoise, blue, brown, green, and yellow module #respectively.
simulateProportions1=c(0.2,0.15, 0.08, 0.06, 0.04)
# Note that the proportions don’t add up to 1. The remaining genes will be colored grey,
# ie the grey genes are non-module genes.
# set the seed of the random number generator. As a homework exercise change this seed.
set.seed(1)
#Step 1: simulate a module eigengene network.
# Training Data Set I
MEgreen=rnorm(no.obs)
scaledy=MEgreen*ESgreen+sqrt(1-ESgreen^2)*rnorm(no.obs)
y=ifelse( scaledy>median(scaledy),2,1)
MEturquoise= ESturquoise*scaledy+sqrt(1-ESturquoise^2)*rnorm(no.obs)
# we simulate a strong dependence between MEblue and MEturquoise
MEblue= .6*MEturquoise+ sqrt(1-.6^2) *rnorm(no.obs)
MEbrown= ESbrown*scaledy+sqrt(1-ESbrown^2)*rnorm(no.obs)
MEyellow= ESyellow*scaledy+sqrt(1-ESyellow^2)*rnorm(no.obs)
ModuleEigengeneNetwork1=data.frame(y,MEturquoise,MEblue,MEbrown,MEgreen, MEyellow)
ModuleEigengeneNetwork1[1:3, ]
##   y MEturquoise      MEblue  MEbrown    MEgreen    MEyellow
## 1 1 -0.62036668 -0.01207033 0.361954 -0.6264538  0.13622189
## 2 1  0.04211587  0.01042166 1.578760  0.1836433  0.40716760
## 3 1 -0.91092165 -0.80100769 1.406360 -0.8356286 -0.06965481
nrow(ModuleEigengeneNetwork1)
## [1] 50
dat1=simulateDatExpr5Modules(MEturquoise=ModuleEigengeneNetwork1$MEturquoise,
          MEblue=ModuleEigengeneNetwork1$MEblue,
          MEbrown=ModuleEigengeneNetwork1$MEbrown,
          MEyellow=ModuleEigengeneNetwork1$MEyellow,
          MEgreen=ModuleEigengeneNetwork1$MEgreen, 
          nGenes=nGenes1, 
          simulateProportions=simulateProportions1)

class(dat1)
## [1] "list"
str(dat1)
## List of 3
##  $ datExpr   : num [1:50, 1:3000] -0.769 0.201 -0.893 0.286 -0.622 ...
##  $ truemodule: chr [1:3000] "turquoise" "turquoise" "turquoise" "turquoise" ...
##  $ datME     :'data.frame':  50 obs. of  5 variables:
##   ..$ MEturquoise: num [1:50] -0.6204 0.0421 -0.9109 0.158 -0.6546 ...
##   ..$ MEblue     : num [1:50] -0.0121 0.0104 -0.801 -0.6487 -1.5827 ...
##   ..$ MEbrown    : num [1:50] 0.362 1.579 1.406 -0.297 -2.635 ...
##   ..$ MEyellow   : num [1:50] 0.1362 0.4072 -0.0697 -0.2477 0.6956 ...
##   ..$ MEgreen    : num [1:50] -0.626 0.184 -0.836 1.595 0.33 ...
dat1$datME[1:3, ]
##   MEturquoise      MEblue  MEbrown    MEyellow    MEgreen
## 1 -0.62036668 -0.01207033 0.361954  0.13622189 -0.6264538
## 2  0.04211587  0.01042166 1.578760  0.40716760  0.1836433
## 3 -0.91092165 -0.80100769 1.406360 -0.06965481 -0.8356286
dat1$datExpr[1:5, 1:3]
##            [,1]        [,2]        [,3]
## [1,] -0.7690318 -0.25281902  0.33073345
## [2,]  0.2010896  0.42931409 -0.27419755
## [3,] -0.8929247 -1.10073581  0.70294278
## [4,]  0.2861690  0.05983439  0.02233195
## [5,] -0.6224324 -0.61522751  0.80939447
datExpr = dat1$datExpr
truemodule = dat1$truemodule
datME = dat1$datME
# attach(ModuleEigengeneNetwork1)
table(truemodule)
## truemodule
##      blue     brown     green      grey turquoise    yellow 
##       450       240       120      1410       600       180
dim(datExpr)
## [1]   50 3000
datExpr=data.frame(datExpr)
ArrayName=paste("Sample",1:dim(datExpr)[[1]], sep="" )   
# The following code is useful for outputting the simulated data 
GeneName=paste("Gene",1:dim(datExpr)[[2]], sep="" )   
dimnames(datExpr)[[1]]=ArrayName
dimnames(datExpr)[[2]]=GeneName

Loading of expression data, an alternative to data simulation, provided to illustrate data loading of real data

# Load WGCNA package
library(WGCNA)
datGeneSummary=read.csv("GeneSummaryTutorial.csv")
datTraits=read.csv("TraitsTutorial.csv")
datMicroarrays=read.csv("MicroarrayDataTutorial.csv")
#=====================================================================================
#
#  Code chunk 3
#
#=====================================================================================

# This vector contains the microarray sample names
ArrayName= names(data.frame(datMicroarrays[,-1]))
 # This vector contains the gene names
GeneName= datMicroarrays$GeneName
# We transpose the data so that the rows correspond to samples and the columns correspond to genes
# Since the first column contains the gene names, we exclude it.
datExpr=data.frame(t(datMicroarrays[,-1]))
names(datExpr)=datMicroarrays[,1]
dimnames(datExpr)[[1]]=names(data.frame(datMicroarrays[,-1]))
#Also, since we simulated the data, we know the true module color:
truemodule= datGeneSummary$truemodule
rm(datMicroarrays)
collectGarbage()


#=====================================================================================
#
#  Code chunk 4
#
#=====================================================================================


# First, make sure that the array names in the file datTraits line up with those in the microarray data 
table( dimnames(datExpr)[[1]]==datTraits$ArrayName)
# Next, define the microarray sample trait 
y=datTraits$y

Basic data preprocessing illustrates rudimentary techniques for handling missing data and removing outliers

# Load WGCNA package
library(WGCNA)
# The following setting is important, do not omit.
#options(stringsAsFactors = FALSE);
# Load the previously saved data
# load("Simulated-dataSimulation.RData");
# attach(ModuleEigengeneNetwork1)
meanExpressionByArray=apply( datExpr,1,mean, na.rm=T)  
NumberMissingByArray=apply( is.na(data.frame(datExpr)),1, sum)
# sizeGrWindow(9, 5)
barplot(meanExpressionByArray,
        xlab = "Sample", ylab = "Mean expression",
        main ="Mean expression across samples",
        names.arg = c(1:50), cex.names = 0.7)

# Keep only arrays containing less than 500 missing entries
KeepArray= NumberMissingByArray<500
table(KeepArray)
## KeepArray
## TRUE 
##   50
datExpr=datExpr[KeepArray,]
y=y[KeepArray]
#ArrayName[KeepArray]
NumberMissingByGene =apply( is.na(data.frame(datExpr)),2, sum)
# One could do a barplot(NumberMissingByGene), but the barplot is empty in this case.
# It may be better to look at the numbers of missing samples using the summary method:
summary(NumberMissingByGene)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
# Calculate the variances of the probes and the number of present entries
variancedatExpr=as.vector(apply(as.matrix(datExpr),2,var, na.rm=T))
no.presentdatExpr=as.vector(apply(!is.na(as.matrix(datExpr)),2, sum) )
# Another way of summarizing the number of pressent entries
table(no.presentdatExpr)
## no.presentdatExpr
##   50 
## 3000
# Keep only genes whose variance is non-zero and have at least 4 present entries
KeepGenes= variancedatExpr>0 & no.presentdatExpr>=4
table(KeepGenes)
## KeepGenes
## TRUE 
## 3000
datExpr=datExpr[, KeepGenes]
#GeneName=GeneName[KeepGenes]
# sizeGrWindow(9, 5)
plotClusterTreeSamples(datExpr=datExpr, y=y)

Standard gene screening illustrates gene selection based on Pearson correlation and shows that the results are not satisfactory

# Load WGCNA package
library(WGCNA)
GS1= as.numeric(cor(y, datExpr, use="p"))
# Network terminology: GS1 will be referred to as signed gene significance measure
p.Standard=corPvalueFisher(GS1, nSamples =length(y) )
# since the q-value function has problems with missing data, we use the following trick
p.Standard2=p.Standard
p.Standard2[is.na(p.Standard)]=1
q.Standard=qvalue(p.Standard2)$qvalues
# Form a data frame to hold the results
StandardGeneScreeningResults=data.frame(GeneName,PearsonCorrelation=GS1, p.Standard, q.Standard)
NoiseGeneIndicator=is.element( truemodule, c("turquoise", "blue", "yellow", "grey"))+.0
SignalGeneIndicator=1-NoiseGeneIndicator
table(q.Standard<.20)
## 
## FALSE  TRUE 
##  2997     3
mean(NoiseGeneIndicator[q.Standard<=0.20]) 
## [1] 0.6666667
#=====================================================================================
#
#  Code chunk 6
#
#=====================================================================================


# save.image(file = "Simulated-StandardScreening.RData")

Construction of a weighted gene co-expression network and network modules illustrated step-by-step; includes a discussion of alternate clustering techniques

# Load WGCNA package
library(WGCNA)
# Load additional necessary packages
library(cluster)
# here we define the adjacency matrix using soft thresholding with beta=6
ADJ1=abs(cor(datExpr,use="p"))^6
# When you have relatively few genes (<5000) use the following code
k=as.vector(apply(ADJ1,2,sum, na.rm=T))
# When you have a lot of genes use the following code
k=softConnectivity(datE=datExpr,power=6) 
##  softConnectivity: FYI: connecitivty of genes with less than 17 valid samples will be returned as NA.
##  ..calculating connectivities..
# Plot a histogram of k and a scale free topology plot
#sizeGrWindow(10,5)
par(mfrow=c(1,2))
hist(k)
scaleFreePlot(k, main="Check scale free topology\n")

##   scaleFreeRsquared slope
## 1              0.91 -2.08
datExpr=datExpr[, rank(-k,ties.method="first" )<=3600]
# Turn adjacency into a measure of dissimilarity
dissADJ=1-ADJ1
dissTOM=TOMdist(ADJ1)
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
collectGarbage()
pam4=pam(as.dist(dissADJ), 4)
pam5=pam(as.dist(dissADJ), 5)
pam6=pam(as.dist(dissADJ), 6)
# Cross-tabulte the detected and the true (simulated) module membership:
table(pam4$clustering, truemodule)
##    truemodule
##     blue brown green grey turquoise yellow
##   1   51     8    17  792       530      1
##   2  380    10    34  261        37      6
##   3   12     4    17  167        17    172
##   4    7   218    52  190        16      1
table(pam5$clustering, truemodule)
##    truemodule
##     blue brown green grey turquoise yellow
##   1   50     5     2  742       524      1
##   2  373    10     3  234        35      6
##   3   11     3     2  142        15    170
##   4    9     7   107  133        16      2
##   5    7   215     6  159        10      1
table(pam6$clustering, truemodule)
##    truemodule
##     blue brown green grey turquoise yellow
##   1   39     3     1  440       310      1
##   2   21     5     3  412       235      1
##   3  363    10     3  186        26      6
##   4   11     3     2  119         8    169
##   5    9     6   106  119        14      2
##   6    7   213     5  134         7      1
pamTOM4=pam(as.dist(dissTOM), 4)
pamTOM5=pam(as.dist(dissTOM), 5)
pamTOM6=pam(as.dist(dissTOM), 6)
# Cross-tabulte the detected and the true (simulated) module membership:
table(pamTOM4$clustering, truemodule)
##    truemodule
##     blue brown green grey turquoise yellow
##   1   58    15    34 1100       576     10
##   2  384    11    48  197        21      9
##   3    3     2     4   42         2    159
##   4    5   212    34   71         1      2
table(pamTOM5$clustering, truemodule)
##    truemodule
##     blue brown green grey turquoise yellow
##   1   58    13     9 1087       574     10
##   2  384    10     7  190        20      9
##   3    0     4    97   28         3      0
##   4    3     2     0   40         2    159
##   5    5   211     7   65         1      2
table(pamTOM6$clustering, truemodule)
##    truemodule
##     blue brown green grey turquoise yellow
##   1   49    11     7  808       475      8
##   2    9     2     2  285        99      2
##   3  384    10     7  185        20      9
##   4    0     4    97   28         3      0
##   5    3     2     0   40         2    159
##   6    5   211     7   64         1      2
hierADJ=hclust(as.dist(dissADJ), method="average" )
# Plot the resulting clustering tree together with the true color assignment
#sizeGrWindow(10,5);
plotDendroAndColors(hierADJ, colors = data.frame(truemodule), dendroLabels = FALSE, hang = 0.03, 
                    main = "Gene hierarchical clustering dendrogram and simulated module colors" )

colorStaticADJ=as.character(cutreeStaticColor(hierADJ, cutHeight=.99, minSize=20))
# Plot the dendrogram with module colors
# sizeGrWindow(10,5);
plotDendroAndColors(hierADJ, colors = data.frame(truemodule, colorStaticADJ),
                    dendroLabels = FALSE, abHeight = 0.99,
                    main = "Gene dendrogram and module colors")

branch.number=cutreeDynamic(hierADJ,method="tree")
# This function transforms the branch numbers into colors
colorDynamicADJ=labels2colors(branch.number )
colorDynamicHybridADJ=labels2colors(cutreeDynamic(hierADJ,distM= dissADJ, 
                              cutHeight = 0.998, deepSplit=2, pamRespectsDendro = FALSE))
##  ..done.
# Plot results of all module detection methods together:
# sizeGrWindow(10,5)
plotDendroAndColors(dendro = hierADJ, 
                    colors=data.frame(truemodule, colorStaticADJ, 
                                     colorDynamicADJ, colorDynamicHybridADJ), 
                    dendroLabels = FALSE, marAll = c(0.2, 8, 2.7, 0.2),
                    main = "Gene dendrogram and module colors")

# Calculate the dendrogram
hierTOM = hclust(as.dist(dissTOM),method="average");
# The reader should vary the height cut-off parameter h1 
# (related to the y-axis of dendrogram) in the following
colorStaticTOM = as.character(cutreeStaticColor(hierTOM, cutHeight=.99, minSize=20))
colorDynamicTOM = labels2colors (cutreeDynamic(hierTOM,method="tree"))
colorDynamicHybridTOM = labels2colors(cutreeDynamic(hierTOM, distM= dissTOM , cutHeight = 0.998,
                       deepSplit=2, pamRespectsDendro = FALSE))
##  ..done.
# Now we plot the results
# sizeGrWindow(10,5)
plotDendroAndColors(hierTOM, 
               colors=data.frame(truemodule, colorStaticTOM, 
                                 colorDynamicTOM, colorDynamicHybridTOM), 
               dendroLabels = FALSE, marAll = c(1, 8, 3, 1),
               main = "Gene dendrogram and module colors, TOM dissimilarity")

tabStaticADJ=table(colorStaticADJ,truemodule)
tabStaticTOM=table(colorStaticTOM,truemodule)
tabDynamicADJ=table(colorDynamicADJ, truemodule)
tabDynamicTOM=table(colorDynamicTOM,truemodule)
tabDynamicHybridADJ =table(colorDynamicHybridADJ,truemodule)
tabDynamicHybridTOM =table(colorDynamicHybridTOM,truemodule)
randIndex(tabStaticADJ,adjust=F)
## [1] 0.5072126
randIndex(tabStaticTOM,adjust=F)
## [1] 0.5997922
randIndex(tabDynamicADJ,adjust=F)
## [1] 0.5033845
randIndex(tabDynamicTOM,adjust=F)
## [1] 0.5980453
randIndex(tabDynamicHybridADJ ,adjust=F)
## [1] 0.7054071
randIndex(tabDynamicHybridTOM ,adjust=F)
## [1] 0.7039642
colorh1= colorDynamicHybridTOM
# remove the dissimilarities, adjacency matrices etc to free up space
#rm(ADJ1); rm(dissADJ);              
# collectGarbage()
# save.image("Simulated-NetworkConstruction.RData")

Relating modules and module eigengenes to external data illustrates methods for relating modules to external microarray sample traits

# Load WGCNA package
library(WGCNA)
datME=moduleEigengenes(datExpr,colorh1)$eigengenes
signif(cor(datME, use="p"), 2)
##             MEblue MEbrown MEgreen MEgrey MEturquoise MEyellow
## MEblue       1.000   0.072 -0.2100  0.077      0.6700   -0.110
## MEbrown      0.072   1.000 -0.2800 -0.140      0.1200    0.079
## MEgreen     -0.210  -0.280  1.0000  0.086      0.0092   -0.096
## MEgrey       0.077  -0.140  0.0860  1.000      0.2700    0.025
## MEturquoise  0.670   0.120  0.0092  0.270      1.0000    0.099
## MEyellow    -0.110   0.079 -0.0960  0.025      0.0990    1.000
dissimME=(1-t(cor(datME, method="p")))/2
hclustdatME=hclust(as.dist(dissimME), method="average" )
# Plot the eigengene dendrogram
par(mfrow=c(1,1))
plot(hclustdatME, main="Clustering tree based of the module eigengenes")

# sizeGrWindow(8,9)
plotMEpairs(datME,y=y)
## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

signif(cor(datME, ModuleEigengeneNetwork1[,-1]),2)
##             MEturquoise MEblue MEbrown MEgreen MEyellow
## MEblue            0.680  1.000   0.066  -0.190  -0.1100
## MEbrown           0.100  0.071   0.990  -0.280   0.0840
## MEgreen           0.015 -0.210  -0.270   0.990  -0.1100
## MEgrey            0.270  0.077  -0.140   0.047   0.0083
## MEturquoise       1.000  0.660   0.110   0.021   0.1200
## MEyellow          0.099 -0.120   0.110  -0.110   0.9900
# sizeGrWindow(8,9)
par(mfrow=c(3,1), mar=c(1, 2, 4, 1))
which.module="turquoise"; 
plotMat(t(scale(datExpr[,colorh1==which.module ]) ),nrgcols=30,rlabels=T,
         clabels=T,rcols=which.module,
         title=which.module )
# for the second (blue) module we use
which.module="blue";  
plotMat(t(scale(datExpr[,colorh1==which.module ]) ),nrgcols=30,rlabels=T,
         clabels=T,rcols=which.module,
         title=which.module )
which.module="brown"; 
plotMat(t(scale(datExpr[,colorh1==which.module ]) ),nrgcols=30,rlabels=T,
         clabels=T,rcols=which.module,
         title=which.module )

# sizeGrWindow(8,7);
which.module="green"
ME=datME[, paste("ME",which.module, sep="")]
par(mfrow=c(2,1), mar=c(0.3, 5.5, 3, 2))
plotMat(t(scale(datExpr[,colorh1==which.module ]) ),
         nrgcols=30,rlabels=F,rcols=which.module,
         main=which.module, cex.main=2)
par(mar=c(5, 4.2, 0, 0.7))
barplot(ME, col=which.module, main="", cex.main=2,
        ylab="eigengene expression",xlab="array sample")

signif(cor(y,datME, use="p"),2)
##      MEblue MEbrown MEgreen MEgrey MEturquoise MEyellow
## [1,]   -0.1   -0.28    0.42  -0.18       0.056    0.099
cor.test(y, datME$MEbrown)
## 
##  Pearson's product-moment correlation
## 
## data:  y and datME$MEbrown
## t = -1.9826, df = 48, p-value = 0.05315
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.514098581  0.003495354
## sample estimates:
##      cor 
## -0.27512
p.values = corPvalueStudent(cor(y,datME, use="p"), nSamples = length(y))
GS1=as.numeric(cor(y,datExpr, use="p"))
GeneSignificance=abs(GS1)
# Next module significance is defined as average gene significance.
ModuleSignificance=tapply(GeneSignificance, colorh1, mean, na.rm=T)
# sizeGrWindow(8,7)
par(mfrow = c(1,1))
plotModuleSignificance(GeneSignificance,colorh1)

# collectGarbage()
# save.image("Simulated-RelatingToExt.RData")

Module membership, intramodular connectivity, and screening for intramodular hub genes illustrates using the intramodular connectivity to define measures of module membership and to screen for genes based on network information

# Load WGCNA package
library(WGCNA)
library(cluster)
# Load the previously saved data
# load("Simulated-RelatingToExt.RData");
ADJ1=abs(cor(datExpr,use="p"))^6
Alldegrees1=intramodularConnectivity(ADJ1, colorh1)
head(Alldegrees1)
##         kTotal  kWithin     kOut    kDiff
## Gene1 31.80186 28.37595 3.425906 24.95005
## Gene2 28.88249 26.47896 2.403522 24.07544
## Gene3 25.38600 23.11852 2.267486 20.85103
## Gene4 24.01574 22.12962 1.886122 20.24350
## Gene5 24.93663 21.69175 3.244881 18.44687
## Gene6 25.91260 23.92613 1.986469 21.93966
colorlevels=unique(colorh1)
# sizeGrWindow(9,6)
par(mfrow=c(2,as.integer(0.5+length(colorlevels)/2)))
par(mar = c(4,5,3,1))
for (i in c(1:length(colorlevels))) 
{
  whichmodule=colorlevels[[i]]; 
  restrict1 = (colorh1==whichmodule);
  verboseScatterplot(Alldegrees1$kWithin[restrict1], 
                     GeneSignificance[restrict1], col=colorh1[restrict1],
                     main=whichmodule, 
                     xlab = "Connectivity", ylab = "Gene Significance", abline = TRUE)
}

datKME=signedKME(datExpr, datME, outputColumnName="MM.")
# Display the first few rows of the data frame
head(datKME)
##          MM.blue    MM.brown     MM.green    MM.grey MM.turquoise   MM.yellow
## Gene1  0.6830511  0.11547756 -0.007124794  0.2840109    0.9481457  0.09588170
## Gene2  0.6342657  0.02257975  0.080277091  0.3029967    0.9356343  0.06889483
## Gene3 -0.6198067 -0.12531203  0.008372054 -0.2776929   -0.9121710 -0.17852211
## Gene4  0.5966736  0.06469079  0.049862112  0.2671967    0.9052030  0.11707603
## Gene5  0.6642214  0.14369720 -0.017975774  0.2442237    0.9017972 -0.01038067
## Gene6 -0.6018161 -0.15167072  0.006667131 -0.2053897   -0.9192597 -0.17138960
FilterGenes= abs(GS1)> .2 & abs(datKME$MM.brown)>.8
table(FilterGenes)
## FilterGenes
## FALSE  TRUE 
##  2989    11
dimnames(data.frame(datExpr))[[2]][FilterGenes]
##  [1] "Gene1051" "Gene1052" "Gene1053" "Gene1054" "Gene1056" "Gene1057"
##  [7] "Gene1059" "Gene1060" "Gene1061" "Gene1063" "Gene1075"
# sizeGrWindow(8,6)
par(mfrow=c(2,2))
# We choose 4 modules to plot: turquoise, blue, brown, green. 
# For simplicity we write the code out explicitly for each module.
which.color="turquoise"; 
restrictGenes=colorh1==which.color 
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes], 
                   (datKME[restrictGenes, paste("MM.", which.color, sep="")])^6,
                   col=which.color, 
                   xlab="Intramodular Connectivity", 
                   ylab="(Module Membership)^6")

which.color="blue"; 
restrictGenes=colorh1==which.color
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes],
                   (datKME[restrictGenes, paste("MM.", which.color, sep="")])^6,
                   col=which.color,
                   xlab="Intramodular Connectivity",
                   ylab="(Module Membership)^6")

which.color="brown"; 
restrictGenes=colorh1==which.color
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes],
                   (datKME[restrictGenes, paste("MM.", which.color, sep="")])^6,
                   col=which.color,
                   xlab="Intramodular Connectivity",
                   ylab="(Module Membership)^6")

which.color="green";
restrictGenes=colorh1==which.color 
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes], 
                   (datKME[restrictGenes, paste("MM.", which.color, sep="")])^6,
                   col=which.color, 
                   xlab="Intramodular Connectivity", 
                   ylab="(Module Membership)^6")

NS1=networkScreening(y=y, datME=datME, datExpr=datExpr,
         oddPower=3, blockSize=1000, minimumSampleSize=4,
         addMEy=TRUE, removeDiag=FALSE, weightESy=0.5)
## Proportion of agreement between lists based on abs(Cor.Weighted) and abs(Cor.Standard):
## Top  10  list of genes: prop. agree =  0.5
## Top  20  list of genes: prop. agree =  0.45
## Top  50  list of genes: prop. agree =  0.62
## Top  100  list of genes: prop. agree =  0.68
## Top  200  list of genes: prop. agree =  0.71
## Top  500  list of genes: prop. agree =  0.8
## Top  1000  list of genes: prop. agree =  0.814
# network screening analysis
mean(NoiseGeneIndicator[rank(NS1$p.Weighted,ties.method="first")<=100])
## [1] 0.2
# standard analysis based on the correlation p-values (or Student T test)
mean(NoiseGeneIndicator[rank(NS1$p.Standard,ties.method="first")<=100])
## [1] 0.48
topNumbers=c(10,20,50,100)
for (i in c(1:length(topNumbers)) ) 
{
  print(paste("Proportion of noise genes in the top", topNumbers[i], "list"))
  WGCNApropNoise=mean(NoiseGeneIndicator[rank(NS1$p.Weighted,ties.method="first")<=topNumbers[i]])
  StandardpropNoise=mean(NoiseGeneIndicator[rank(NS1$p.Standard,ties.method="first")<=topNumbers[i]])
  print(paste("WGCNA, proportion of noise=", WGCNApropNoise, 
        ", Standard, prop. noise=", StandardpropNoise))
  if (WGCNApropNoise< StandardpropNoise) print("WGCNA wins")
  if (WGCNApropNoise==StandardpropNoise) print("both methods tie")
  if (WGCNApropNoise>StandardpropNoise) print("standard screening wins")
}
## [1] "Proportion of noise genes in the top 10 list"
## [1] "WGCNA, proportion of noise= 0 , Standard, prop. noise= 0.4"
## [1] "WGCNA wins"
## [1] "Proportion of noise genes in the top 20 list"
## [1] "WGCNA, proportion of noise= 0.1 , Standard, prop. noise= 0.4"
## [1] "WGCNA wins"
## [1] "Proportion of noise genes in the top 50 list"
## [1] "WGCNA, proportion of noise= 0.14 , Standard, prop. noise= 0.4"
## [1] "WGCNA wins"
## [1] "Proportion of noise genes in the top 100 list"
## [1] "WGCNA, proportion of noise= 0.2 , Standard, prop. noise= 0.48"
## [1] "WGCNA wins"
# rm(dissTOM); collectGarbage()
#Form a data frame containing standard and network screening results
CorPrediction1=data.frame(GS1,NS1$cor.Weighted)
cor.Weighted=NS1$cor.Weighted
# Plot the comparison
# sizeGrWindow(8, 6)
verboseScatterplot(cor.Weighted, GS1,
         main="Network-based weighted correlation versus Pearson correlation\n",
         col=truemodule, cex.main = 1.2)
abline(0,1)

set.seed(2)
nSamples2=2000
MEgreen=rnorm(nSamples2)
scaledy2=MEgreen*ESgreen+sqrt(1-ESgreen^2)*rnorm(nSamples2)
y2=ifelse( scaledy2>median(scaledy2),2,1)
MEturquoise= ESturquoise*scaledy2+sqrt(1-ESturquoise^2)*rnorm(nSamples2)
# we simulate a strong dependence between MEblue and MEturquoise
MEblue= .6*MEturquoise+ sqrt(1-.6^2) *rnorm(nSamples2)
MEbrown= ESbrown*scaledy2+sqrt(1-ESbrown^2)*rnorm(nSamples2)
MEyellow= ESyellow*scaledy2+sqrt(1-ESyellow^2)*rnorm(nSamples2)
# Put together a data frame of eigengenes
ModuleEigengeneNetwork2=data.frame(y=y2,MEturquoise,MEblue,MEbrown,MEgreen, MEyellow)
# Simulate the expression data
dat2=simulateDatExpr5Modules(MEturquoise=ModuleEigengeneNetwork2$MEturquoise,
   MEblue=ModuleEigengeneNetwork2$MEblue,MEbrown=ModuleEigengeneNetwork2$MEbrown,
   MEyellow=ModuleEigengeneNetwork2$MEyellow,
   MEgreen=ModuleEigengeneNetwork2$MEgreen,simulateProportions=simulateProportions1, 
   nGenes=nGenes1)
# recall that this is the signed gene significance in the training data
GS1= as.numeric(cor(y, datExpr, use="p"))
# the following is the signed gene significance in the test data
GS2=as.numeric( cor(ModuleEigengeneNetwork2$y, dat2$datExpr, use="p"))
# sizeGrWindow(8,6)
par(mfrow=c(1,1))
verboseScatterplot(GS1,GS2,
       main="Trait-based gene significance in test set vs. training set\n",
       xlab = "Training set gene significance",
       ylab = "Test set gene significance",
       col=truemodule, cex.main = 1.4)

EvaluationGeneScreening1 = corPredictionSuccess(
           corPrediction = CorPrediction1, 
           corTestSet=GS2,
           topNumber=seq(from=20, to=500, length=30) )
par(mfrow=c(2,2))
listcomp = EvaluationGeneScreening1$meancorTestSetOverall
matplot(x = listcomp$topNumber,
        y = listcomp[,-1], 
        main="Predicting positive and negative correlations",
        ylab="mean cor, test data", 
        xlab="top number of genes in the training data")
listcomp= EvaluationGeneScreening1$meancorTestSetPositive
matplot(x = listcomp$topNumber,
        y = listcomp[,-1], 
        main="Predicting positive correlations",
        ylab="mean cor, test data", 
        xlab="top number of genes in the training data")
listcomp= EvaluationGeneScreening1$meancorTestSetNegative
matplot(x = listcomp$topNumber,
        y = listcomp[,-1], 
        main = "Predicting negative correlations",
        ylab = "mean cor, test data", 
        xlab = "top number of genes in the training data")

relativeCorPredictionSuccess(corPredictionNew = NS1$cor.Weighted,
                             corPredictionStandard = GS1, 
                             corTestSet=GS2,
                             topNumber=c(10,20,50,100,200,500) )
##   topNumber corPredictionNew.kruskalP
## 1        10              1.725158e-02
## 2        20              3.437721e-03
## 3        50              2.042847e-05
## 4       100              6.861281e-07
## 5       200              1.451270e-05
## 6       500              1.350232e-04
# Create a data frame holding the results of gene screening
GeneResultsNetworkScreening=data.frame(GeneName=row.names(NS1), NS1)
# Write the data frame into a file
# write.table(GeneResultsNetworkScreening, file="GeneResultsNetworkScreening.csv", row.names=F,sep=",")
# Output of eigengene information:
datMEy = data.frame(y, datME)
eigengeneSignificance = cor(datMEy, y);
eigengeneSignificance[1,1] = (1+max(eigengeneSignificance[-1, 1]))/2
eigengeneSignificance.pvalue = corPvalueStudent(eigengeneSignificance, nSamples = length(y))
namesME=names(datMEy)
# Form a summary data frame
out1=data.frame(t(data.frame(eigengeneSignificance,
eigengeneSignificance.pvalue, namesME, t(datMEy))))
# Set appropriate row names
dimnames(out1)[[1]][1]="EigengeneSignificance"
dimnames(out1)[[1]][2]="EigengeneSignificancePvalue"
dimnames(out1)[[1]][3]="ModuleEigengeneName"
dimnames(out1)[[1]][-c(1:3)]=dimnames(datExpr)[[1]]
# Write the data frame into a file
# write.table(out1, file="MEResultsNetworkScreening.csv", row.names=TRUE, col.names = TRUE, sep=",")
# Display the first few rows:
head(out1)
##                                        y       MEblue      MEbrown      MEgreen
## EigengeneSignificance         0.70970248  -0.10142300  -0.27512004   0.41940495
## EigengeneSignificancePvalue 7.909070e-09 4.834028e-01 5.315189e-02 2.431272e-03
## ModuleEigengeneName                    y       MEblue      MEbrown      MEgreen
## Sample1                      1.000000000 -0.020390790  0.069159851 -0.138749932
## Sample2                       1.00000000   0.01062292   0.24403639   0.01513903
## Sample3                      1.000000000 -0.109366048  0.232823387 -0.159918300
##                                   MEgrey  MEturquoise     MEyellow
## EigengeneSignificance        -0.17512633   0.05594979   0.09932761
## EigengeneSignificancePvalue 2.238187e-01 6.995553e-01 4.925316e-01
## ModuleEigengeneName               MEgrey  MEturquoise     MEyellow
## Sample1                     -0.066877071 -0.050387559  0.002085968
## Sample2                      -0.01593587   0.02762906   0.03332384
## Sample3                      0.071309662 -0.121035286 -0.003678822
# Write out gene information
GeneName=dimnames(datExpr)[[2]]
GeneSummary=data.frame(GeneName, truemodule, SignalGeneIndicator,  NS1)
# write.table(GeneSummary, file="GeneSummaryTutorial.csv", row.names=F,sep=",")
# here we output the module eigengenes and trait y without eigengene significances
datTraits=data.frame(ArrayName, datMEy)
dimnames(datTraits)[[2]][2:length(namesME)]=paste("Trait",  
                                             dimnames(datTraits)[[2]][2:length(namesME)], 
                                             sep=".")
# write.table(datTraits, file="TraitsTutorial.csv", row.names=F,sep=",")
rm(datTraits)
# here we output the simulated gene expression data
MicroarrayData=data.frame(GeneName, t(datExpr))
names(MicroarrayData)[-1]=ArrayName
# write.table(MicroarrayData, file="MicroarrayDataTutorial.csv", row.names=F,sep=",")
rm(MicroarrayData)
# Perform network screening
NS1GS=networkScreeningGS(datExpr=datExpr, datME = datME, GS=GS1)
## Proportion of agreement between GS.Weighted and GS:
## Top  10  list of genes: prop. of agreement =  0.3
## Top  20  list of genes: prop. of agreement =  0.4
## Top  50  list of genes: prop. of agreement =  0.36
## Top  100  list of genes: prop. of agreement =  0.33
## Top  200  list of genes: prop. of agreement =  0.32
## Top  500  list of genes: prop. of agreement =  0.424
## Top  1000  list of genes: prop. of agreement =  0.558
# Organize its results for easier plotting
GSprediction1=data.frame(GS1,NS1GS$GS.Weighted)
GS.Weighted=NS1GS$GS.Weighted
# Plot a comparison between standard gene significance and network-weighted gene significance
# sizeGrWindow(8, 6)
par(mfrow=c(1,1))
verboseScatterplot(GS1, GS.Weighted, 
                   main="Weighted gene significance vs. the standard GS\n",
                   col=truemodule)
abline(0,1)

EvaluationGeneScreeningGS = corPredictionSuccess(corPrediction=GSprediction1, corTestSet=GS2,
                                    topNumber=seq(from=20, to=500, length=30) )
# sizeGrWindow(8, 6)
par(mfrow=c(2,2))
listcomp= EvaluationGeneScreeningGS$meancorTestSetOverall
matplot(x=listcomp$topNumber,
        y=listcomp[,-1],
        main="Predicting positive and negative correlations",
        ylab="mean cor, test data",
        xlab="top number of genes in the training data")
listcomp= EvaluationGeneScreeningGS$meancorTestSetPositive
matplot(x=listcomp$topNumber, 
        y=listcomp[,-1], 
        main="Predicting positive correlations",
        ylab="mean cor, test data",
        xlab="top number of genes in the training data")
listcomp= EvaluationGeneScreeningGS$meancorTestSetNegative
matplot(x=listcomp$topNumber,
        y=listcomp[,-1],
        main="Predicting negative correlations",
        ylab="mean cor, test data",
        xlab="top number of genes in the training data")

# collectGarbage()
# save.image("Simulated-Screening.RData")

Visualization of gene networks

# Load WGCNA package
library(WGCNA)
library(cluster)
# Load the previously saved data
# load("Simulated-RelatingToExt.RData"); 
# load("Simulated-Screening.RData")
cmd1=cmdscale(as.dist(dissTOM),2)
# sizeGrWindow(7, 6)
par(mfrow=c(1,1))
plot(cmd1, col=as.character(colorh1),  main="MDS plot",
     xlab="Scaling Dimension 1", ylab="Scaling Dimension 2")

power=6
color1=colorDynamicTOM
restGenes= (color1 != "grey")
diss1=1-TOMsimilarityFromExpr( datExpr[, restGenes], power = 6 )
## TOM calculation: adjacency..
## ..will not use multithreading.
##  Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
hier1=hclust(as.dist(diss1), method="average" )
diag(diss1) = NA;
# sizeGrWindow(7,7)
TOMplot(diss1^4, hier1, as.character(color1[restGenes]),
        main = "TOM heatmap plot, module genes" )

power=6
color1=colorDynamicTOM
restGenes= (color1 != "grey")
diss1=1-adjacency( datExpr[, restGenes], power = 6 )
hier1=hclust(as.dist(diss1), method="average" )
diag(diss1) = NA;
# sizeGrWindow(7,7)
TOMplot(diss1^4, hier1, as.character(color1[restGenes]),
        main = "Adjacency heatmap plot, module genes" )

# sizeGrWindow(7,7)
topList=rank(NS1$p.Weighted,ties.method="first")<=30
gene.names= names(datExpr)[topList]
# The following shows the correlations between the top genes
plotNetworkHeatmap(datExpr, plotGenes = gene.names,
                   networkType="signed", useTOM=FALSE,
                   power=1, main="signed correlations")

# sizeGrWindow(7,7)
# The following shows the correlations between the top genes
plotNetworkHeatmap(datExpr, plotGenes = gene.names,
                   networkType="unsigned", useTOM=FALSE,
                   power=1, main="signed correlations")

# sizeGrWindow(7,7)
# The following shows the TOM heatmap in a signed network
plotNetworkHeatmap(datExpr, plotGenes = gene.names,
                   networkType="signed", useTOM=TRUE,
                   power=12, main="C. TOM in a signed network")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.

# The following shows the TOM heatmap in a unsigned network
plotNetworkHeatmap(datExpr, plotGenes = gene.names,
                   networkType="unsigned", useTOM=TRUE,
                   power=6, main="D. TOM in an unsigned network")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.